//go:build !noasm && amd64
// AUTO-GENERATED BY GOAT -- DO NOT EDIT

// dot_512 computes the float32 dot product sum(a[i] * b[i]) and stores the
// single float32 result through res.
//
// Arguments (32 bytes of pointer-sized slots on the Go frame):
//   a+0(FP)    pointer to the first float32 vector            -> DI
//   b+8(FP)    pointer to the second float32 vector           -> SI
//   res+16(FP) pointer to the float32 that receives the sum   -> DX
//   len+24(FP) pointer to the element count (read as *len)    -> CX
//
// NOTE(review): the matching Go stub is not visible here; presumably it is
// declared as func dot_512(a, b, res, len unsafe.Pointer) -- confirm.
//
// Only the low 32 bits of *len are examined (every compare is on EAX).
// A count <= 0 stores 0.0 to *res and returns without reading a or b.
//
// Code paths, chosen by the element count n:
//   n <= 7    scalar FMA only (LBB0_3 / LBB0_17), result kept in xmm0.
//   n >= 128  AVX-512 loop LBB0_8, 128 floats per iteration in zmm1..zmm8.
//   n >= 32   AVX2 loops LBB0_11 / LBB0_20, four ymm accumulators.
//   n >= 8    AVX2 loops LBB0_25 / LBB0_28, accumulating into ymm1.
//   rest      scalar FMA tail (LBB0_32 / LBB0_37), accumulated in xmm4.
//
// NOTE(review): this file is generated; the n <= 0 guard below was added by
// hand and must also be applied to the C source (or enforced by the Go
// wrapper), otherwise regenerating the file will drop it.
TEXT ·dot_512(SB), $0-32
	MOVQ a+0(FP), DI
	MOVQ b+8(FP), SI
	MOVQ res+16(FP), DX
	MOVQ len+24(FP), CX
	BYTE $0x55               // pushq	%rbp
	WORD $0x8948; BYTE $0xe5 // movq	%rsp, %rbp
	LONG $0xf8e48348         // andq	$-8, %rsp
	WORD $0x8b48; BYTE $0x01 // movq	(%rcx), %rax

	// Guard: with n <= 0 the scalar loops below never reach their exit
	// condition (they count up until ecx == eax) and would read far out of
	// bounds. Return 0.0 through the small-path epilogue instead.
	LONG $0xc057f8c5         // vxorps	%xmm0, %xmm0, %xmm0
	WORD $0xc085             // testl	%eax, %eax
	JLE  LBB0_5

	// n > 7 takes the vector paths; otherwise stay on the scalar-only path.
	WORD $0xf883; BYTE $0x07 // cmpl	$7, %eax
	JG   LBB0_6
	LONG $0xff408d44         // leal	-1(%rax), %r8d
	WORD $0x03a8             // testb	$3, %al
	JE   LBB0_15
	// r9d = n & 3: number of single elements to process before the
	// 4x-unrolled scalar loop.
	WORD $0x8941; BYTE $0xc1 // movl	%eax, %r9d
	LONG $0x03e18341         // andl	$3, %r9d
	LONG $0xc057f8c5         // vxorps	%xmm0, %xmm0, %xmm0
	WORD $0xc931             // xorl	%ecx, %ecx

// One element per iteration: xmm0 += a[0] * b[0]; runs r9d times.
LBB0_3:
	LONG $0x0f10fac5             // vmovss	(%rdi), %xmm1
	LONG $0xb971e2c4; BYTE $0x06 // vfmadd231ss	(%rsi), %xmm1, %xmm0
	LONG $0x04c78348             // addq	$4, %rdi
	LONG $0x04c68348             // addq	$4, %rsi
	LONG $0x01c18348             // addq	$1, %rcx
	WORD $0x3941; BYTE $0xc9     // cmpl	%ecx, %r9d
	JNE  LBB0_3
	// eax = elements still to do; r8d still holds the original n - 1.
	WORD $0xc829                 // subl	%ecx, %eax
	LONG $0x03f88341             // cmpl	$3, %r8d
	JAE  LBB0_16
	JMP  LBB0_5

// n >= 8. xmm0 is cleared here; use the AVX-512 loop only when n >= 128.
LBB0_6:
	LONG $0xc057f8c5             // vxorps	%xmm0, %xmm0, %xmm0
	LONG $0x0000803d; BYTE $0x00 // cmpl	$128, %eax
	JB   LBB0_13
	// Clear the eight accumulators zmm1..zmm8 (VEX-encoded xmm writes also
	// zero the upper bits of the full register).
	LONG $0xc957f0c5             // vxorps	%xmm1, %xmm1, %xmm1
	LONG $0xd257e8c5             // vxorps	%xmm2, %xmm2, %xmm2
	LONG $0xdb57e0c5             // vxorps	%xmm3, %xmm3, %xmm3
	LONG $0xed57d0c5             // vxorps	%xmm5, %xmm5, %xmm5
	LONG $0xe457d8c5             // vxorps	%xmm4, %xmm4, %xmm4
	LONG $0xf657c8c5             // vxorps	%xmm6, %xmm6, %xmm6
	LONG $0xff57c0c5             // vxorps	%xmm7, %xmm7, %xmm7
	LONG $0x573841c4; BYTE $0xc0 // vxorps	%xmm8, %xmm8, %xmm8

// AVX-512 main loop: 8 x 16 = 128 floats (512 bytes) per iteration.
LBB0_8:
	LONG $0x487c7162; WORD $0x0f10             // vmovups	(%rdi), %zmm9
	LONG $0x487c7162; WORD $0x5710; BYTE $0x01 // vmovups	64(%rdi), %zmm10
	LONG $0x487c7162; WORD $0x5f10; BYTE $0x02 // vmovups	128(%rdi), %zmm11
	LONG $0x487c7162; WORD $0x6710; BYTE $0x03 // vmovups	192(%rdi), %zmm12
	LONG $0x487c7162; WORD $0x6f10; BYTE $0x04 // vmovups	256(%rdi), %zmm13
	LONG $0x487c7162; WORD $0x7710; BYTE $0x05 // vmovups	320(%rdi), %zmm14
	LONG $0x487c7162; WORD $0x7f10; BYTE $0x06 // vmovups	384(%rdi), %zmm15
	LONG $0x4835f262; WORD $0x0eb8             // vfmadd231ps	(%rsi), %zmm9, %zmm1
	LONG $0x482df262; WORD $0x56b8; BYTE $0x01 // vfmadd231ps	64(%rsi), %zmm10, %zmm2
	LONG $0x4825f262; WORD $0x5eb8; BYTE $0x02 // vfmadd231ps	128(%rsi), %zmm11, %zmm3
	LONG $0x481df262; WORD $0x6eb8; BYTE $0x03 // vfmadd231ps	192(%rsi), %zmm12, %zmm5
	LONG $0x4815f262; WORD $0x66b8; BYTE $0x04 // vfmadd231ps	256(%rsi), %zmm13, %zmm4
	LONG $0x480df262; WORD $0x76b8; BYTE $0x05 // vfmadd231ps	320(%rsi), %zmm14, %zmm6
	LONG $0x4805f262; WORD $0x7eb8; BYTE $0x06 // vfmadd231ps	384(%rsi), %zmm15, %zmm7
	LONG $0x487c7162; WORD $0x4f10; BYTE $0x07 // vmovups	448(%rdi), %zmm9
	LONG $0x48357262; WORD $0x46b8; BYTE $0x07 // vfmadd231ps	448(%rsi), %zmm9, %zmm8
	WORD $0xc083; BYTE $0x80                   // addl	$-128, %eax
	LONG $0x00c78148; WORD $0x0002; BYTE $0x00 // addq	$512, %rdi
	LONG $0x00c68148; WORD $0x0002; BYTE $0x00 // addq	$512, %rsi
	// Keep looping until the remaining count equals its own sign-extended
	// low byte, i.e. until it fits in a signed 8-bit value.
	WORD $0xbe0f; BYTE $0xc8                   // movsbl	%al, %ecx
	WORD $0xc139                               // cmpl	%eax, %ecx
	JNE  LBB0_8
	// Reduce zmm1..zmm8 pairwise into zmm1, then fold its upper 256 bits
	// into ymm1. ymm2 is cleared for the code that follows.
	LONG $0x4874f162; WORD $0xca58             // vaddps	%zmm2, %zmm1, %zmm1
	LONG $0x4864f162; WORD $0xd558             // vaddps	%zmm5, %zmm3, %zmm2
	LONG $0x4874f162; WORD $0xca58             // vaddps	%zmm2, %zmm1, %zmm1
	LONG $0x485cf162; WORD $0xd658             // vaddps	%zmm6, %zmm4, %zmm2
	LONG $0x4844d162; WORD $0xd858             // vaddps	%zmm8, %zmm7, %zmm3
	LONG $0x486cf162; WORD $0xd358             // vaddps	%zmm3, %zmm2, %zmm2
	LONG $0x4874f162; WORD $0xca58             // vaddps	%zmm2, %zmm1, %zmm1
	LONG $0x48fdf362; WORD $0xcb1b; BYTE $0x01 // vextractf64x4	$1, %zmm1, %ymm3
	LONG $0xd257e8c5                           // vxorps	%xmm2, %xmm2, %xmm2
	LONG $0xca58f4c5                           // vaddps	%ymm2, %ymm1, %ymm1
	LONG $0xcb58f4c5                           // vaddps	%ymm3, %ymm1, %ymm1
	// Nothing left: finish at LBB0_18. Fewer than 32 left: skip the
	// 32-wide stage.
	WORD $0xc085                               // testl	%eax, %eax
	JE   LBB0_18
	WORD $0xf883; BYTE $0x20                   // cmpl	$32, %eax
	JB   LBB0_14

// At least 32 floats remain. r9d = eax - 32. When bit 5 of r9d is clear, one
// 32-float step is done here first so that LBB0_20 can consume 64 floats per
// iteration; otherwise LBB0_19 only clears the extra accumulators.
LBB0_11:
	LONG $0xe0488d44               // leal	-32(%rax), %r9d
	LONG $0x20c1f641               // testb	$32, %r9b
	JNE  LBB0_19
	LONG $0x2710fcc5               // vmovups	(%rdi), %ymm4
	LONG $0x4710fcc5; BYTE $0x20   // vmovups	32(%rdi), %ymm0
	LONG $0x5f10fcc5; BYTE $0x40   // vmovups	64(%rdi), %ymm3
	LONG $0x5710fcc5; BYTE $0x60   // vmovups	96(%rdi), %ymm2
	LONG $0xb85de2c4; BYTE $0x0e   // vfmadd231ps	(%rsi), %ymm4, %ymm1
	// ymm4 = 0 is the addend below, so ymm0/ymm3/ymm2 start as plain
	// element-wise products.
	LONG $0xe457d8c5               // vxorps	%xmm4, %xmm4, %xmm4
	LONG $0x985de2c4; WORD $0x2046 // vfmadd132ps	32(%rsi), %ymm4, %ymm0
	LONG $0x985de2c4; WORD $0x405e // vfmadd132ps	64(%rsi), %ymm4, %ymm3
	LONG $0x985de2c4; WORD $0x6056 // vfmadd132ps	96(%rsi), %ymm4, %ymm2
	// subq $-128 advances each pointer by 128 bytes (32 floats).
	LONG $0x80ef8348               // subq	$-128, %rdi
	LONG $0x80ee8348               // subq	$-128, %rsi
	WORD $0x8944; BYTE $0xc8       // movl	%r9d, %eax
	LONG $0x20f98341               // cmpl	$32, %r9d
	JAE  LBB0_20
	JMP  LBB0_22

// Entry for 8 <= n < 128: the AVX-512 loop was skipped, so clear ymm1 here.
LBB0_13:
	LONG $0xc957f0c5         // vxorps	%xmm1, %xmm1, %xmm1
	WORD $0xf883; BYTE $0x20 // cmpl	$32, %eax
	JAE  LBB0_11

// Fewer than 32 floats remain: clear ymm3 and ymm2 so the final reduction at
// LBB0_38 can add them unconditionally.
LBB0_14:
	LONG $0xdb57e0c5 // vxorps	%xmm3, %xmm3, %xmm3
	LONG $0xd257e8c5 // vxorps	%xmm2, %xmm2, %xmm2
	JMP  LBB0_21

// Scalar-only path with n a multiple of 4: no single-element loop needed.
LBB0_15:
	LONG $0xc057f8c5 // vxorps	%xmm0, %xmm0, %xmm0
	LONG $0x03f88341 // cmpl	$3, %r8d
	JB   LBB0_5

LBB0_16:
	WORD $0xc089 // movl	%eax, %eax
	WORD $0xc931 // xorl	%ecx, %ecx

// Four elements per iteration, chained through xmm0 -> xmm1 -> xmm2 -> xmm0.
// Exits only when ecx == eax.
LBB0_17:
	LONG $0x0c10fac5; BYTE $0x8f               // vmovss	(%rdi,%rcx,4), %xmm1
	LONG $0x5410fac5; WORD $0x048f             // vmovss	4(%rdi,%rcx,4), %xmm2
	LONG $0x9979e2c4; WORD $0x8e0c             // vfmadd132ss	(%rsi,%rcx,4), %xmm0, %xmm1
	LONG $0xb969e2c4; WORD $0x8e4c; BYTE $0x04 // vfmadd231ss	4(%rsi,%rcx,4), %xmm2, %xmm1
	LONG $0x5410fac5; WORD $0x088f             // vmovss	8(%rdi,%rcx,4), %xmm2
	LONG $0x9971e2c4; WORD $0x8e54; BYTE $0x08 // vfmadd132ss	8(%rsi,%rcx,4), %xmm1, %xmm2
	LONG $0x4410fac5; WORD $0x0c8f             // vmovss	12(%rdi,%rcx,4), %xmm0
	LONG $0x9969e2c4; WORD $0x8e44; BYTE $0x0c // vfmadd132ss	12(%rsi,%rcx,4), %xmm2, %xmm0
	LONG $0x04c18348                           // addq	$4, %rcx
	WORD $0xc839                               // cmpl	%ecx, %eax
	JNE  LBB0_17

// Scalar-path epilogue: *res = xmm0, restore the stack and return. Also the
// target of the n <= 0 guard at the top.
LBB0_5:
	LONG $0x0211fac5         // vmovss	%xmm0, (%rdx)
	WORD $0x8948; BYTE $0xec // movq	%rbp, %rsp
	BYTE $0x5d               // popq	%rbp
	BYTE $0xc3               // retq

// Epilogue when the AVX-512 loop consumed every element: horizontally sum
// ymm1 (ymm2 and xmm1 are zero here), store the result and return.
LBB0_18:
	LONG $0xc258f4c5               // vaddps	%ymm2, %ymm1, %ymm0
	LONG $0xc07cffc5               // vhaddps	%ymm0, %ymm0, %ymm0
	LONG $0xc07cffc5               // vhaddps	%ymm0, %ymm0, %ymm0
	LONG $0x197de3c4; WORD $0x01c1 // vextractf128	$1, %ymm0, %xmm1
	LONG $0xc158fac5               // vaddss	%xmm1, %xmm0, %xmm0
	LONG $0xc957f0c5               // vxorps	%xmm1, %xmm1, %xmm1
	LONG $0xc158fac5               // vaddss	%xmm1, %xmm0, %xmm0
	LONG $0x0211fac5               // vmovss	%xmm0, (%rdx)
	WORD $0x8948; BYTE $0xec       // movq	%rbp, %rsp
	BYTE $0x5d                     // popq	%rbp
	WORD $0xf8c5; BYTE $0x77       // vzeroupper
	BYTE $0xc3                     // retq

// No single 32-float step was needed: clear the three extra accumulators
// before entering the 64-float loop.
LBB0_19:
	LONG $0xd257e8c5 // vxorps	%xmm2, %xmm2, %xmm2
	LONG $0xdb57e0c5 // vxorps	%xmm3, %xmm3, %xmm3
	LONG $0xc057f8c5 // vxorps	%xmm0, %xmm0, %xmm0
	LONG $0x20f98341 // cmpl	$32, %r9d
	JB   LBB0_22

// AVX2 loop: 64 floats (256 bytes) per iteration as two groups of four FMAs.
// The running sums live in ymm1, ymm0, ymm3 and ymm2 at the end of each pass.
LBB0_20:
	LONG $0x2710fcc5                           // vmovups	(%rdi), %ymm4
	LONG $0x6f10fcc5; BYTE $0x20               // vmovups	32(%rdi), %ymm5
	LONG $0x7710fcc5; BYTE $0x40               // vmovups	64(%rdi), %ymm6
	LONG $0x7f10fcc5; BYTE $0x60               // vmovups	96(%rdi), %ymm7
	LONG $0x9875e2c4; BYTE $0x26               // vfmadd132ps	(%rsi), %ymm1, %ymm4
	LONG $0x987de2c4; WORD $0x206e             // vfmadd132ps	32(%rsi), %ymm0, %ymm5
	LONG $0x9865e2c4; WORD $0x4076             // vfmadd132ps	64(%rsi), %ymm3, %ymm6
	LONG $0x986de2c4; WORD $0x607e             // vfmadd132ps	96(%rsi), %ymm2, %ymm7
	QUAD $0x000000808f10fcc5                   // vmovups	128(%rdi), %ymm1
	QUAD $0x000000a08710fcc5                   // vmovups	160(%rdi), %ymm0
	QUAD $0x000000c09f10fcc5                   // vmovups	192(%rdi), %ymm3
	QUAD $0x000000e09710fcc5                   // vmovups	224(%rdi), %ymm2
	QUAD $0x0000808e985de2c4; BYTE $0x00       // vfmadd132ps	128(%rsi), %ymm4, %ymm1
	QUAD $0x0000a0869855e2c4; BYTE $0x00       // vfmadd132ps	160(%rsi), %ymm5, %ymm0
	QUAD $0x0000c09e984de2c4; BYTE $0x00       // vfmadd132ps	192(%rsi), %ymm6, %ymm3
	QUAD $0x0000e0969845e2c4; BYTE $0x00       // vfmadd132ps	224(%rsi), %ymm7, %ymm2
	WORD $0xc083; BYTE $0xc0                   // addl	$-64, %eax
	LONG $0x00c78148; WORD $0x0001; BYTE $0x00 // addq	$256, %rdi
	LONG $0x00c68148; WORD $0x0001; BYTE $0x00 // addq	$256, %rsi
	WORD $0xf883; BYTE $0x1f                   // cmpl	$31, %eax
	JA   LBB0_20

// From here on r9d holds the remaining element count.
LBB0_21:
	WORD $0x8941; BYTE $0xc1 // movl	%eax, %r9d

// 8-float stage. ecx = number of whole 8-float groups left
// (((r9d - 8) >> 3) + 1). If that is not a multiple of 4, LBB0_25 handles the
// odd groups one at a time before the 4x-unrolled loop LBB0_28.
LBB0_22:
	LONG $0x08f98341         // cmpl	$8, %r9d
	JB   LBB0_29
	LONG $0xf8418d45         // leal	-8(%r9), %r8d
	WORD $0x8944; BYTE $0xc0 // movl	%r8d, %eax
	WORD $0xe8c1; BYTE $0x03 // shrl	$3, %eax
	WORD $0x488d; BYTE $0x01 // leal	1(%rax), %ecx
	WORD $0xc1f6; BYTE $0x03 // testb	$3, %cl
	JE   LBB0_27
	// eax = (groups & 3) * 8 = number of floats for LBB0_25 to process.
	WORD $0x0104             // addb	$1, %al
	WORD $0xb60f; BYTE $0xc0 // movzbl	%al, %eax
	WORD $0xe083; BYTE $0x03 // andl	$3, %eax
	LONG $0x03e0c148         // shlq	$3, %rax
	WORD $0xc931             // xorl	%ecx, %ecx

// 8 floats per iteration: ymm1 += a[0:8] * b[0:8].
LBB0_25:
	LONG $0x2710fcc5             // vmovups	(%rdi), %ymm4
	LONG $0xb85de2c4; BYTE $0x0e // vfmadd231ps	(%rsi), %ymm4, %ymm1
	LONG $0x20c78348             // addq	$32, %rdi
	LONG $0x20c68348             // addq	$32, %rsi
	LONG $0x08c18348             // addq	$8, %rcx
	WORD $0xc839                 // cmpl	%ecx, %eax
	JNE  LBB0_25
	WORD $0x2941; BYTE $0xc9     // subl	%ecx, %r9d

// r8d is still the stage's starting count minus 8; below 24 means fewer than
// four groups existed, so the unrolled loop has nothing to do.
LBB0_27:
	LONG $0x18f88341 // cmpl	$24, %r8d
	JB   LBB0_29

// 32 floats per iteration, all accumulated into ymm1 (ymm4 is a temporary).
LBB0_28:
	LONG $0x2710fcc5               // vmovups	(%rdi), %ymm4
	LONG $0x6f10fcc5; BYTE $0x20   // vmovups	32(%rdi), %ymm5
	LONG $0x7710fcc5; BYTE $0x40   // vmovups	64(%rdi), %ymm6
	LONG $0x7f10fcc5; BYTE $0x60   // vmovups	96(%rdi), %ymm7
	LONG $0x9875e2c4; BYTE $0x26   // vfmadd132ps	(%rsi), %ymm1, %ymm4
	LONG $0xb855e2c4; WORD $0x2066 // vfmadd231ps	32(%rsi), %ymm5, %ymm4
	LONG $0xb84de2c4; WORD $0x4066 // vfmadd231ps	64(%rsi), %ymm6, %ymm4
	LONG $0xcc28fcc5               // vmovaps	%ymm4, %ymm1
	LONG $0xb845e2c4; WORD $0x604e // vfmadd231ps	96(%rsi), %ymm7, %ymm1
	LONG $0xe0c18341               // addl	$-32, %r9d
	LONG $0x80ef8348               // subq	$-128, %rdi
	LONG $0x80ee8348               // subq	$-128, %rsi
	LONG $0x07f98341               // cmpl	$7, %r9d
	JA   LBB0_28

// Scalar tail for the last r9d (< 8) elements, accumulated in xmm4. Same
// shape as the n <= 7 path: (r9d & 3) single steps, then 4x-unrolled steps.
LBB0_29:
	WORD $0x8545; BYTE $0xc9 // testl	%r9d, %r9d
	JE   LBB0_34
	LONG $0xff418d45         // leal	-1(%r9), %r8d
	LONG $0x03c1f641         // testb	$3, %r9b
	JE   LBB0_35
	WORD $0x8944; BYTE $0xc9 // movl	%r9d, %ecx
	WORD $0xe183; BYTE $0x03 // andl	$3, %ecx
	LONG $0xe457d8c5         // vxorps	%xmm4, %xmm4, %xmm4
	WORD $0xc031             // xorl	%eax, %eax

// One element per iteration: xmm4 += a[0] * b[0]; runs ecx times.
LBB0_32:
	LONG $0x2f10fac5             // vmovss	(%rdi), %xmm5
	LONG $0xb951e2c4; BYTE $0x26 // vfmadd231ss	(%rsi), %xmm5, %xmm4
	LONG $0x04c78348             // addq	$4, %rdi
	LONG $0x04c68348             // addq	$4, %rsi
	LONG $0x01c08348             // addq	$1, %rax
	WORD $0xc139                 // cmpl	%eax, %ecx
	JNE  LBB0_32
	WORD $0x2941; BYTE $0xc1     // subl	%eax, %r9d
	LONG $0x03f88341             // cmpl	$3, %r8d
	JAE  LBB0_36
	JMP  LBB0_38

// No tail elements: the scalar contribution is zero.
LBB0_34:
	LONG $0xe457d8c5 // vxorps	%xmm4, %xmm4, %xmm4
	JMP  LBB0_38

// Tail count is a multiple of 4: skip the single-element loop.
LBB0_35:
	LONG $0xe457d8c5 // vxorps	%xmm4, %xmm4, %xmm4
	LONG $0x03f88341 // cmpl	$3, %r8d
	JB   LBB0_38

LBB0_36:
	WORD $0x8944; BYTE $0xc8 // movl	%r9d, %eax
	WORD $0xc931             // xorl	%ecx, %ecx

// Four elements per iteration, chained through xmm4 -> xmm5 -> xmm6 -> xmm4.
LBB0_37:
	LONG $0x2c10fac5; BYTE $0x8f               // vmovss	(%rdi,%rcx,4), %xmm5
	LONG $0x7410fac5; WORD $0x048f             // vmovss	4(%rdi,%rcx,4), %xmm6
	LONG $0x9959e2c4; WORD $0x8e2c             // vfmadd132ss	(%rsi,%rcx,4), %xmm4, %xmm5
	LONG $0xb949e2c4; WORD $0x8e6c; BYTE $0x04 // vfmadd231ss	4(%rsi,%rcx,4), %xmm6, %xmm5
	LONG $0x7410fac5; WORD $0x088f             // vmovss	8(%rdi,%rcx,4), %xmm6
	LONG $0x9951e2c4; WORD $0x8e74; BYTE $0x08 // vfmadd132ss	8(%rsi,%rcx,4), %xmm5, %xmm6
	LONG $0x6410fac5; WORD $0x0c8f             // vmovss	12(%rdi,%rcx,4), %xmm4
	LONG $0x9949e2c4; WORD $0x8e64; BYTE $0x0c // vfmadd132ss	12(%rsi,%rcx,4), %xmm6, %xmm4
	LONG $0x04c18348                           // addq	$4, %rcx
	WORD $0xc839                               // cmpl	%ecx, %eax
	JNE  LBB0_37

// Final reduction: add the four ymm accumulators, horizontally sum the eight
// lanes, add the scalar tail in xmm4, store to *res and return.
LBB0_38:
	LONG $0xc158fcc5               // vaddps	%ymm1, %ymm0, %ymm0
	LONG $0xca58e4c5               // vaddps	%ymm2, %ymm3, %ymm1
	LONG $0xc058f4c5               // vaddps	%ymm0, %ymm1, %ymm0
	LONG $0xc07cffc5               // vhaddps	%ymm0, %ymm0, %ymm0
	LONG $0xc07cffc5               // vhaddps	%ymm0, %ymm0, %ymm0
	LONG $0x197de3c4; WORD $0x01c1 // vextractf128	$1, %ymm0, %xmm1
	LONG $0xc158fac5               // vaddss	%xmm1, %xmm0, %xmm0
	LONG $0xc058dac5               // vaddss	%xmm0, %xmm4, %xmm0
	LONG $0x0211fac5               // vmovss	%xmm0, (%rdx)
	WORD $0x8948; BYTE $0xec       // movq	%rbp, %rsp
	BYTE $0x5d                     // popq	%rbp
	WORD $0xf8c5; BYTE $0x77       // vzeroupper
	BYTE $0xc3                     // retq
